In [3]:
import pandas as pd
import numpy as np
import os
import glob
import nltk.data
import nltk, re, pprint
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import sqlite3
def connect_db():
    return sqlite3.connect('/Users/sheldon/podcasts/test.db')

def create_df_object():
    conn = connect_db()
    df = pd.read_sql("select * from podcast", conn)
    return df
df = create_df_object()
stop = set(stopwords.words('english'))
In [6]:
#df.head()
import psycopg2
import sys
from sqlalchemy import create_engine
engine = create_engine('postgresql://sheldon@localhost:5432/sheldon')
df1 = pd.read_sql("select * from podcasts",engine)
In [7]:
df1.head()  # preview the table; DataFrame.query() takes a boolean expression, not SQL
In [3]:
def remove_stop_words(row):
    tokens = word_tokenize(str(row))
    tokens = [w for w in tokens if w not in stop]
    tokens = [word for word in tokens if "'" not in word]  # drop contraction fragments
    return ' '.join(tokens)
df['transcribed'] = df['transcribed'].apply(remove_stop_words)
texts = df.transcribed.tolist()
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text.split():   # split into tokens; iterating the string itself would count characters
        frequency[token] += 1
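The same tally can be written more compactly with collections.Counter (already imported above); a minimal sketch, with token_counts as an illustrative name:

In [ ]:
# Equivalent token-frequency count using Counter
token_counts = Counter(token for text in texts for token in text.split())
token_counts.most_common(10)   # ten most frequent tokens across all transcripts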
In [4]:
from gensim import corpora, models, similarities
import gensim
In [4]:
'''Model-building code (run once; the saved artifacts are loaded below):
df["review_text"] = df["transcribed"].map(lambda x: x.split(' '))
from gensim import corpora
dictionary = corpora.Dictionary(df["review_text"])
dictionary.save('words.dict')

class MyCorpus(object):
    def __iter__(self):
        for doc in df["transcribed"]:
            yield dictionary.doc2bow(doc.split())

corpus_mem_friendly = MyCorpus()
corpora.MmCorpus.serialize('corpus.mm', corpus_mem_friendly)
'''
Out[4]:
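The next cell loads pre-built artifacts from models/. For reference, a minimal sketch of the one-time build that would produce them, assuming the dictionary and memory-friendly corpus from the commented-out cell above:

In [ ]:
# One-time build of the saved models (sketch; paths match the load cell below)
tfidf = models.TfidfModel(corpus)                                        # re-weight raw counts
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=75)   # 75 latent topics
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])                 # similarity index over all docs
tfidf.save('models/tfidf_model')
lsi.save('models/model.lsi')
index.save('models/corpus.index')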
In [9]:
# load all the saved artifacts
dictionary = corpora.Dictionary.load('models/words.dict')
corpus = corpora.MmCorpus('models/corpus.mm')  # serialized .mm corpora load via the constructor, not .load()
tfidf = gensim.models.tfidfmodel.TfidfModel.load('models/tfidf_model')
lsi = gensim.models.lsimodel.LsiModel.load('models/model.lsi')
index = similarities.MatrixSimilarity.load('models/corpus.index')
corpus_tfidf = tfidf[corpus]
corpus_lsi = lsi[corpus_tfidf]
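Each entry of corpus_lsi is a document's coordinates in the 75-topic LSI space as (topic_id, weight) pairs. Note the weights can be negative, so ranking by absolute value is often more informative than the raw-value sort used below:

In [ ]:
# Peek at the strongest topics for the first transcript (sketch)
doc_topics = corpus_lsi[0]                                    # list of (topic_id, weight)
sorted(doc_topics, key=lambda t: abs(t[1]), reverse=True)[:5]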
In [8]:
def get_related_podcasts(index):
    def getKey(item):
        return item[1]
    corpus = corpus_lsi[index]
    corpus = sorted(corpus, key=getKey, reverse=True)[0:10]
    related_df = pd.DataFrame(corpus, columns=['index', 'score'])
    final_df = pd.merge(related_df, df, on='index')[['index', 'episode', 'score', 'series']]
    return final_df
get_related_podcasts(1)
In [11]:
def getKey(item):
    return item[1]

lsi.show_topics(num_words=100)[1]
sorted(corpus_lsi[17], key=getKey, reverse=True)
sorted(lsi.show_topic(8, topn=100), key=getKey, reverse=True)
Out[11]:
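show_topic(t, topn=n) returns the n highest-weight (word, weight) pairs for topic t, which makes it easy to build rough human-readable labels; a sketch, with labels as an illustrative name:

In [ ]:
# Top-3 words per topic as a quick label for each of the 75 topics
labels = {t: [word for word, weight in lsi.show_topic(t, topn=3)] for t in range(lsi.num_topics)}
labels[8]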
In [216]:
corpus_lsi[1]
Out[216]:
In [9]:
def get_related_podcasts(index):
    def getKey(item):
        return item[1]
    corpus = corpus_lsi[index]
    corpus = sorted(corpus, key=getKey, reverse=True)[0:10]
    related_df = pd.DataFrame(corpus, columns=['index', 'score'])
    final_df = pd.merge(related_df, df, on='index')[['index', 'episode', 'score', 'series']]
    return final_df

related_podcasts = list(get_related_podcasts(1)['index'])

def get_topics_per_podcast(podcast_index):
    topic_ids = [i for i in sorted(corpus_lsi[podcast_index], key=getKey, reverse=True) if i[1] > 0.10]
    def get_topic_arrays(topic_ids):
        x = []
        for id in topic_ids:
            list_of_words = sorted(lsi.show_topic(id[0], topn=5), key=getKey, reverse=True)
            z = []
            for word in list_of_words:
                if word[1] > .05:
                    z.append(word)
            x.append(z)
        return x
    return get_topic_arrays(topic_ids)

testing = [[related_podcasts[i], get_topics_per_podcast(related_podcasts[i])] for i in range(0, len(related_podcasts))]
In [12]:
x = pd.DataFrame(testing, columns=['index', 'words'])
x.words.iloc[0]   # .ix is deprecated; .iloc does positional lookup
Out[12]:
In [150]:
def get_related_podcasts(query):
    def getKey(item):
        return item[1]
    # project the query into LSI space and rank every document against it
    vec_box = dictionary.doc2bow(query.split())
    vec_lsi = lsi[vec_box]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])[0:10]
    related_df = pd.DataFrame(sims, columns=['index', 'score'])

    def get_related_podcasts_list(index):
        corpus = corpus_lsi[index]
        corpus = sorted(corpus, key=getKey, reverse=True)[0:10]
        related_df = pd.DataFrame(corpus, columns=['index', 'score'])
        final_df = pd.merge(related_df, df, on='index')[['index', 'episode', 'score', 'series']]
        return final_df

    related_podcasts = list(get_related_podcasts_list(1)['index'])

    def get_topics_per_podcast(podcast_index):
        topic_ids = [i for i in sorted(corpus_lsi[podcast_index], key=getKey, reverse=True) if i[1] > 0.10]
        def get_topic_arrays(topic_ids):
            x = []
            for id in topic_ids:
                list_of_words = sorted(lsi.show_topic(id[0], topn=5), key=getKey, reverse=True)
                z = []
                for word in list_of_words:
                    if word[1] > .05:
                        z.append(word)
                x.append(z)
            return x
        return get_topic_arrays(topic_ids)

    topics_per_podcast = [[related_podcasts[i], get_topics_per_podcast(related_podcasts[i])] for i in range(0, len(related_podcasts))]
    other_df = pd.DataFrame(topics_per_podcast, columns=['topic_index', 'words'])
    final_df = pd.merge(related_df, df)
    test_final_df = pd.merge(other_df, final_df, left_index=True, right_index=True)[['words', 'index', 'score', 'episode', 'series']]
    return test_final_df
In [161]:
x = get_related_podcasts('cats')
zz = x.words.iloc[0]
In [172]:
zz[1]
Out[172]:
In [146]:
tf = TfidfVectorizer(stop_words=stop)
tfidf_matrix = tf.fit_transform(df['transcribed'])
copy_matrix = tf.transform(df['transcribed'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
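cosine_similarities is an n x n matrix of pairwise scores between transcripts, so ranking one row gives the nearest neighbours of that episode. A minimal sketch, with most_similar as an illustrative helper:

In [ ]:
def most_similar(i, n=10):
    # sort row i descending, drop the episode itself, keep the top n
    order = cosine_similarities[i].argsort()[::-1]
    best = [j for j in order if j != i][:n]
    return df.iloc[best][['episode', 'series']].assign(score=cosine_similarities[i][best])

most_similar(0)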
In [130]:
query = 'python economics love'
trans_query = query.lower()
# pass the query as a single document so linear_kernel returns one row of similarities
tfidf_matrix_test = tf.fit_transform([trans_query])
tfidf_matrix_train = tf.transform(df['transcribed'])
query_similarities = linear_kernel(tfidf_matrix_test, tfidf_matrix_train)
query_similarities = query_similarities.argsort()[0][::-1]   # document indices, best match first
pod_dict = dict(zip(range(0, len(query_similarities)), query_similarities))
pod_dict = pd.DataFrame({'rank': list(pod_dict.keys())}, index=list(pod_dict.values()))
#related_podcasts_df = pd.DataFrame.join(pod_dict, df, how='inner')
#final_df = related_podcasts_df.sort_values('rank')[1:11][['rank','episode','series']]
#related_podcasts = final_df['episode']
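A working version of the commented-out join above, assuming pod_dict's index holds document positions aligned with df's row labels (the slice starts at 0 because a free-text query has no self-match to skip):

In [ ]:
# Hedged completion of the commented-out ranking
related_podcasts_df = pod_dict.join(df, how='inner')
final_df = related_podcasts_df.sort_values('rank')[0:10][['rank', 'episode', 'series']]
final_df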
In [131]:
pod_dict
Out[131]:
In [ ]: